; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

define void @vld2(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %5 = fadd fast <4 x float> %4, %2
  store <4 x float> %5, ptr %next.gep19, align 4
  %index.next = add i64 %index, 4
  %6 = icmp eq i64 %index.next, 1024
  br i1 %6, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @vld3(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %3 = fmul fast <12 x float> %wide.vec, %wide.vec
  %4 = shufflevector <12 x float> %3, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %5 = fadd fast <4 x float> %4, %2
  %6 = fmul fast <12 x float> %wide.vec, %wide.vec
  %7 = shufflevector <12 x float> %6, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %8 = fadd fast <4 x float> %5, %7
  store <4 x float> %8, ptr %next.gep23, align 4
  %index.next = add i64 %index, 4
  %9 = icmp eq i64 %index.next, 1024
  br i1 %9, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %4 = fmul fast <16 x float> %wide.vec, %wide.vec
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %6 = fadd fast <4 x float> %5, %3
  %7 = fmul fast <16 x float> %wide.vec, %wide.vec
  %8 = shufflevector <16 x float> %7, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %9 = fmul fast <16 x float> %wide.vec, %wide.vec
  %10 = shufflevector <16 x float> %9, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %11 = fadd fast <4 x float> %10, %8
  %12 = getelementptr inbounds float, ptr %pDst, i64 %1
  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %11, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, ptr %12, align 4
  %index.next = add i64 %index, 4
  %13 = icmp eq i64 %index.next, 1024
  br i1 %13, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: twosrc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x9]
; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
; CHECK-NEXT:    str q4, [x2], #16
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %next.gep23 = getelementptr float, ptr %pSrc2, i64 %1
  %next.gep24 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %wide.vec26 = load <8 x float>, ptr %next.gep23, align 4
  %2 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %3
  store <4 x float> %6, ptr %next.gep24, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @vld2_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2_multiuse:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB4_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB4_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %4 = fadd fast <4 x float> %3, %2
  store <4 x float> %4, ptr %next.gep19, align 4
  %index.next = add i64 %index, 4
  %5 = icmp eq i64 %index.next, 1024
  br i1 %5, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @vld3_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3_multiuse:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB5_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB5_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %3 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %4 = fadd fast <4 x float> %3, %2
  %5 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %6 = fadd fast <4 x float> %4, %5
  store <4 x float> %6, ptr %next.gep23, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4_multiuse:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB6_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %4 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %5 = fadd fast <4 x float> %4, %3
  %6 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %7 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %8 = fadd fast <4 x float> %7, %6
  %9 = getelementptr inbounds float, ptr %pDst, i64 %1
  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %8, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, ptr %9, align 4
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 1024
  br i1 %10, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; This example has store(shuffle(shuffle(... that would be better to be treated
; as a single store. This avoids the vld2 for data that is already shuffled.
define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #32]
; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ldp q4, q5, [x0, #64]
; CHECK-NEXT:    trn1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    ldp q6, q1, [x0, #96]
; CHECK-NEXT:    trn1 v3.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v3.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn1 v1.8h, v6.8h, v1.8h
; CHECK-NEXT:    trn1 v4.4s, v2.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v3.4s, v4.4s
; CHECK-NEXT:    st2 { v3.2s, v4.2s }, [x0]
; CHECK-NEXT:    str q0, [x0, #64]
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}

; Same as above with some different shuffles
define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q2, [x0]
; CHECK-NEXT:    ldp q3, q4, [x0, #32]
; CHECK-NEXT:    mov v0.h[5], v2.h[4]
; CHECK-NEXT:    ldp q5, q6, [x0, #64]
; CHECK-NEXT:    zip1 v3.8h, v3.8h, v4.8h
; CHECK-NEXT:    ldp q7, q2, [x0, #96]
; CHECK-NEXT:    zip1 v4.8h, v5.8h, v6.8h
; CHECK-NEXT:    mov v0.s[1], v4.s[0]
; CHECK-NEXT:    mov v7.h[5], v2.h[4]
; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    str q2, [x0, #64]
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}


define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
; CHECK-LABEL: transpose_s16_8x8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x4]
; CHECK-NEXT:    ldr q3, [x2]
; CHECK-NEXT:    ldr q4, [x3]
; CHECK-NEXT:    ldr q6, [x5]
; CHECK-NEXT:    trn1 v5.8h, v1.8h, v2.8h
; CHECK-NEXT:    ldr q16, [x7]
; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT:    ldr q2, [x6]
; CHECK-NEXT:    trn1 v7.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn2 v3.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn1 v4.8h, v0.8h, v6.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v6.8h
; CHECK-NEXT:    trn1 v17.8h, v2.8h, v16.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v16.8h
; CHECK-NEXT:    trn1 v18.4s, v5.4s, v4.4s
; CHECK-NEXT:    trn1 v20.4s, v1.4s, v0.4s
; CHECK-NEXT:    trn2 v4.4s, v5.4s, v4.4s
; CHECK-NEXT:    trn2 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    trn1 v19.4s, v7.4s, v17.4s
; CHECK-NEXT:    trn1 v21.4s, v3.4s, v2.4s
; CHECK-NEXT:    trn2 v5.4s, v7.4s, v17.4s
; CHECK-NEXT:    trn2 v1.4s, v3.4s, v2.4s
; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x0]
; CHECK-NEXT:    zip2 v2.4s, v18.4s, v19.4s
; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x1]
; CHECK-NEXT:    zip2 v3.4s, v20.4s, v21.4s
; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x2]
; CHECK-NEXT:    zip2 v4.4s, v4.4s, v5.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    str q2, [x4]
; CHECK-NEXT:    str q3, [x5]
; CHECK-NEXT:    str q4, [x6]
; CHECK-NEXT:    str q0, [x7]
; CHECK-NEXT:    ret
  %9 = load <8 x i16>, ptr %0, align 16
  %10 = load <8 x i16>, ptr %1, align 16
  %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %12 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %13 = load <8 x i16>, ptr %2, align 16
  %14 = load <8 x i16>, ptr %3, align 16
  %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %17 = load <8 x i16>, ptr %4, align 16
  %18 = load <8 x i16>, ptr %5, align 16
  %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %21 = load <8 x i16>, ptr %6, align 16
  %22 = load <8 x i16>, ptr %7, align 16
  %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %25 = bitcast <8 x i16> %11 to <4 x i32>
  %26 = bitcast <8 x i16> %19 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %29 = bitcast <8 x i16> %12 to <4 x i32>
  %30 = bitcast <8 x i16> %20 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %33 = bitcast <8 x i16> %15 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %37 = bitcast <8 x i16> %16 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %1, align 16
  store <4 x i32> %45, ptr %2, align 16
  store <4 x i32> %47, ptr %3, align 16
  store <4 x i32> %42, ptr %4, align 16
  store <4 x i32> %44, ptr %5, align 16
  store <4 x i32> %46, ptr %6, align 16
  store <4 x i32> %48, ptr %7, align 16
  ret void
}

define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK-LABEL: transpose_s16_8x8_:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, x0
; CHECK-NEXT:    mov x9, x0
; CHECK-NEXT:    ldp q1, q2, [x0, #64]
; CHECK-NEXT:    mov x10, x0
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v16.8h, v1.8h, v2.8h
; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q3, [x8, #16]!
; CHECK-NEXT:    ldr q4, [x9, #32]!
; CHECK-NEXT:    ldr q5, [x10, #48]!
; CHECK-NEXT:    trn1 v2.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v6.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v7.8h, v0.8h, v3.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v3.8h
; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v4.4s, v7.4s, v16.4s
; CHECK-NEXT:    trn1 v18.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn2 v20.4s, v7.4s, v16.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v17.4s, v2.4s
; CHECK-NEXT:    trn1 v19.4s, v3.4s, v6.4s
; CHECK-NEXT:    trn2 v21.4s, v17.4s, v2.4s
; CHECK-NEXT:    trn2 v1.4s, v3.4s, v6.4s
; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
; CHECK-NEXT:    zip2 v3.4s, v18.4s, v19.4s
; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x8]
; CHECK-NEXT:    zip2 v4.4s, v20.4s, v21.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x10]
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x9]
; CHECK-NEXT:    stp q2, q3, [x0, #64]
; CHECK-NEXT:    stp q4, q0, [x0, #96]
; CHECK-NEXT:    ret
  %2 = load <8 x i16>, ptr %0, align 16
  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
  %4 = load <8 x i16>, ptr %3, align 1
  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
  %8 = load <8 x i16>, ptr %7, align 16
  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
  %10 = load <8 x i16>, ptr %9, align 16
  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
  %14 = load <8 x i16>, ptr %13, align 16
  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
  %16 = load <8 x i16>, ptr %15, align 16
  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
  %20 = load <8 x i16>, ptr %19, align 16
  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
  %22 = load <8 x i16>, ptr %21, align 16
  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %25 = bitcast <8 x i16> %5 to <4 x i32>
  %26 = bitcast <8 x i16> %17 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %29 = bitcast <8 x i16> %6 to <4 x i32>
  %30 = bitcast <8 x i16> %18 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %33 = bitcast <8 x i16> %11 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %37 = bitcast <8 x i16> %12 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %3, align 16
  store <4 x i32> %45, ptr %7, align 16
  store <4 x i32> %47, ptr %9, align 16
  store <4 x i32> %42, ptr %13, align 16
  store <4 x i32> %44, ptr %15, align 16
  store <4 x i32> %46, ptr %19, align 16
  store <4 x i32> %48, ptr %21, align 16
  ret void
}

define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high:
; CHECK:       // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip2 v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v2.4s, v2.4s, v1.4s
; CHECK-NEXT:    str q2, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: store_factor3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
; CHECK-NEXT:    mov v3.s[0], v0.s[0]
; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
; CHECK-NEXT:    mov v4.s[0], v1.s[0]
; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
; CHECK-NEXT:    mov v5.s[0], v2.s[0]
; CHECK-NEXT:    st3 { v3.4s, v4.4s, v5.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: store_factor4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    trn1 v4.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v1.4s, v2.4s
; CHECK-NEXT:    trn1 v6.4s, v2.4s, v3.4s
; CHECK-NEXT:    trn1 v7.4s, v3.4s, v0.4s
; CHECK-NEXT:    st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}
